clear all
set more off
*cd "/Users/jasonborsari/Downloads/Projectt"
*set working directory to project folder
* ************************************************************
* Step 1: Standardize and Clean CPS Data (Fix state names)
* ************************************************************
use "Data/IntermediateData/cleaned_cps_2010_2018.dta", clear

* Convert statefip (numeric) into full state names
gen state = ""
replace state = "Alabama" if statefip == 1
replace state = "Alaska" if statefip == 2
replace state = "Arizona" if statefip == 4
replace state = "Arkansas" if statefip == 5
replace state = "California" if statefip == 6
replace state = "Colorado" if statefip == 8
replace state = "Connecticut" if statefip == 9
replace state = "Delaware" if statefip == 10
replace state = "District of Columbia" if statefip == 11
replace state = "Florida" if statefip == 12
replace state = "Georgia" if statefip == 13
replace state = "Hawaii" if statefip == 15
replace state = "Idaho" if statefip == 16
replace state = "Illinois" if statefip == 17
replace state = "Indiana" if statefip == 18
replace state = "Iowa" if statefip == 19
replace state = "Kansas" if statefip == 20
replace state = "Kentucky" if statefip == 21
replace state = "Louisiana" if statefip == 22
replace state = "Maine" if statefip == 23
replace state = "Maryland" if statefip == 24
replace state = "Massachusetts" if statefip == 25
replace state = "Michigan" if statefip == 26
replace state = "Minnesota" if statefip == 27
replace state = "Mississippi" if statefip == 28
replace state = "Missouri" if statefip == 29
replace state = "Montana" if statefip == 30
replace state = "Nebraska" if statefip == 31
replace state = "Nevada" if statefip == 32
replace state = "New Hampshire" if statefip == 33
replace state = "New Jersey" if statefip == 34
replace state = "New Mexico" if statefip == 35
replace state = "New York" if statefip == 36
replace state = "North Carolina" if statefip == 37
replace state = "North Dakota" if statefip == 38
replace state = "Ohio" if statefip == 39
replace state = "Oklahoma" if statefip == 40
replace state = "Oregon" if statefip == 41
replace state = "Pennsylvania" if statefip == 42
replace state = "Rhode Island" if statefip == 44
replace state = "South Carolina" if statefip == 45
replace state = "South Dakota" if statefip == 46
replace state = "Tennessee" if statefip == 47
replace state = "Texas" if statefip == 48
replace state = "Utah" if statefip == 49
replace state = "Vermont" if statefip == 50
replace state = "Virginia" if statefip == 51
replace state = "Washington" if statefip == 53
replace state = "West Virginia" if statefip == 54
replace state = "Wisconsin" if statefip == 55
replace state = "Wyoming" if statefip == 56

* Drop the original FIPS code column
drop statefip

* Ensure `year` is numeric
destring year, replace force

* Save cleaned CPS dataset
save "Data/IntermediateData/cleaned_cps_fixed.dta", replace

* ************************************************************
* Step 2: Reshape and Merge Income & Disposable Income Data
* ************************************************************

* Fix SAINC1 (Personal Income)

use "Data/IntermediateData/SAINC1_clean.dta", clear

* Drop first 4 rows (assuming they are useless)
drop in 1/4

* Drop first column if it exists
capture drop SAINC1Stateannualpersonalinc

* Keep only `linecode == 1` (Personal income only)
keep if linecode == "1"

* Rename year variables properly for reshaping
rename year_2010 income_2010
rename year_2011 income_2011
rename year_2012 income_2012
rename year_2013 income_2013
rename year_2014 income_2014
rename year_2015 income_2015
rename year_2016 income_2016
rename year_2017 income_2017
rename year_2018 income_2018

* Reshape wide data to long format
reshape long income_, i(state) j(year)

* Ensure `year` is numeric
destring year, replace force

* Save cleaned version
save "Data/IntermediateData/SAINC1_long_fixed.dta", replace

* Fix SAINC51 (Disposable Income)

use "Data/IntermediateData/SAINC51_clean.dta", clear

* Drop first 4 rows (assuming they are useless)
drop in 1/4

* Drop first column if it exists
capture drop SAINC51Stateannualpersonalinc

* Keep only `linecode == 51` (Disposable personal income only)
keep if linecode == "51"

* Rename year variables properly for reshaping
rename year_2010 disposable_income_2010
rename year_2011 disposable_income_2011
rename year_2012 disposable_income_2012
rename year_2013 disposable_income_2013
rename year_2014 disposable_income_2014
rename year_2015 disposable_income_2015
rename year_2016 disposable_income_2016
rename year_2017 disposable_income_2017
rename year_2018 disposable_income_2018

* Reshape wide data to long format
reshape long disposable_income_, i(state) j(year)

* Ensure `year` is numeric
destring year, replace force

* Save cleaned version
save "Data/IntermediateData/SAINC51_long_fixed.dta", replace

* Merge the cleaned datasets

use "Data/IntermediateData/SAINC1_long_fixed.dta", clear

merge 1:1 state year using "Data/IntermediateData/SAINC51_long_fixed.dta"

* Check merge results
tab _merge

* Drop observations that only exist in SAINC51 (if any)
drop if _merge == 2
drop _merge

* Save the final merged income dataset
save "Data/IntermediateData/income_merged.dta", replace

************************************************************
* Step 3: Merge CPS Employment Data
************************************************************

* Load and Prepare CPS Data
use "Data/IntermediateData/cleaned_cps_2010_2018.dta", clear

* Generate `state` names from `statefip`
generate state = ""

replace state = "Alabama" if statefip == 1
replace state = "Alaska" if statefip == 2
replace state = "Arizona" if statefip == 4
replace state = "Arkansas" if statefip == 5
replace state = "California" if statefip == 6
replace state = "Colorado" if statefip == 8
replace state = "Connecticut" if statefip == 9
replace state = "Delaware" if statefip == 10
replace state = "District of Columbia" if statefip == 11
replace state = "Florida" if statefip == 12
replace state = "Georgia" if statefip == 13
replace state = "Hawaii" if statefip == 15
replace state = "Idaho" if statefip == 16
replace state = "Illinois" if statefip == 17
replace state = "Indiana" if statefip == 18
replace state = "Iowa" if statefip == 19
replace state = "Kansas" if statefip == 20
replace state = "Kentucky" if statefip == 21
replace state = "Louisiana" if statefip == 22
replace state = "Maine" if statefip == 23
replace state = "Maryland" if statefip == 24
replace state = "Massachusetts" if statefip == 25
replace state = "Michigan" if statefip == 26
replace state = "Minnesota" if statefip == 27
replace state = "Mississippi" if statefip == 28
replace state = "Missouri" if statefip == 29
replace state = "Montana" if statefip == 30
replace state = "Nebraska" if statefip == 31
replace state = "Nevada" if statefip == 32
replace state = "New Hampshire" if statefip == 33
replace state = "New Jersey" if statefip == 34
replace state = "New Mexico" if statefip == 35
replace state = "New York" if statefip == 36
replace state = "North Carolina" if statefip == 37
replace state = "North Dakota" if statefip == 38
replace state = "Ohio" if statefip == 39
replace state = "Oklahoma" if statefip == 40
replace state = "Oregon" if statefip == 41
replace state = "Pennsylvania" if statefip == 42
replace state = "Rhode Island" if statefip == 44
replace state = "South Carolina" if statefip == 45
replace state = "South Dakota" if statefip == 46
replace state = "Tennessee" if statefip == 47
replace state = "Texas" if statefip == 48
replace state = "Utah" if statefip == 49
replace state = "Vermont" if statefip == 50
replace state = "Virginia" if statefip == 51
replace state = "Washington" if statefip == 53
replace state = "West Virginia" if statefip == 54
replace state = "Wisconsin" if statefip == 55
replace state = "Wyoming" if statefip == 56

drop statefip

* Ensure `year` is numeric
destring year, replace force

* Trim unnecessary spaces in `state`
replace state = trim(ustrtrim(state))

* Compute employment-related variables if missing
capture confirm variable employment_population_ratio
if _rc != 0 {
    generate employment_population_ratio = (empstat == 1) if empstat != .
}

capture confirm variable labor_force_participation_rate
if _rc != 0 {
    generate labor_force_participation_rate = (empstat == 1 | empstat == 2) if empstat != .
}

* Collapse CPS data to state-year level
collapse (mean) employment_population_ratio labor_force_participation_rate, by(state year)

* Ensure uniqueness before merging
duplicates report state year

* Save cleaned CPS dataset
save "Data/IntermediateData/cleaned_cps_aggregated.dta", replace

* Merge CPS Data with Income Data
use "Data/IntermediateData/income_merged.dta", clear

* Trim spaces in `state`
replace state = trim(ustrtrim(state))

* Ensure uniqueness before merging
duplicates report state year

* Merge (many-to-one) CPS data onto income dataset
merge m:1 state year using "Data/IntermediateData/cleaned_cps_aggregated.dta"

* Check and handle merge issues
tab _merge

* Drop observations only in CPS dataset (if any)
drop if _merge == 2
drop _merge

* Save the updated merged dataset
save "Data/IntermediateData/income_cps_merged.dta", replace

************************************************************
* Step 4: Merge Minimum Wage Data
************************************************************

* Load and Prepare Minimum Wage Data

use "Data/IntermediateData/cleaned_min_wage.dta", clear

* Ensure `year` is numeric
destring year, replace force

* Convert state abbreviations to full names
generate state_full = ""

replace state_full = "Alaska" if state == "AK"
replace state_full = "Arizona" if state == "AZ"
replace state_full = "Arkansas" if state == "AR"
replace state_full = "California" if state == "CA"
replace state_full = "Colorado" if state == "CO"
replace state_full = "Connecticut" if state == "CT"
replace state_full = "Delaware" if state == "DE"
replace state_full = "Florida" if state == "FL"
replace state_full = "Georgia" if state == "GA"
replace state_full = "Hawaii" if state == "HI"
replace state_full = "Idaho" if state == "ID"
replace state_full = "Illinois" if state == "IL"
replace state_full = "Indiana" if state == "IN"
replace state_full = "Iowa" if state == "IA"
replace state_full = "Kansas" if state == "KS"
replace state_full = "Kentucky" if state == "KY"
replace state_full = "Maine" if state == "ME"
replace state_full = "Maryland" if state == "MD"
replace state_full = "Massachusetts" if state == "MA"
replace state_full = "Michigan" if state == "MI"
replace state_full = "Minnesota" if state == "MN"
replace state_full = "Missouri" if state == "MO"
replace state_full = "Montana" if state == "MT"
replace state_full = "Nebraska" if state == "NE"
replace state_full = "Nevada" if state == "NV"
replace state_full = "New Hampshire" if state == "NH"
replace state_full = "New Jersey" if state == "NJ"
replace state_full = "New Mexico" if state == "NM"
replace state_full = "New York" if state == "NY"
replace state_full = "North Carolina" if state == "NC"
replace state_full = "North Dakota" if state == "ND"
replace state_full = "Ohio" if state == "OH"
replace state_full = "Oregon" if state == "OR"
replace state_full = "Pennsylvania" if state == "PA"
replace state_full = "Rhode Island" if state == "RI"
replace state_full = "South Dakota" if state == "SD"
replace state_full = "Texas" if state == "TX"
replace state_full = "Utah" if state == "UT"
replace state_full = "Vermont" if state == "VT"
replace state_full = "Virginia" if state == "VA"
replace state_full = "Washington" if state == "WA"
replace state_full = "West Virginia" if state == "WV"
replace state_full = "Wisconsin" if state == "WI"
replace state_full = "Wyoming" if state == "WY"

* Drop the old state abbreviation column
drop state
rename state_full state

* Trim any extra spaces
replace state = trim(ustrtrim(state))

* Save cleaned minimum wage data
save "Data/IntermediateData/cleaned_min_wage_fixed.dta", replace

clear
input str20 state int year float state_min_wage
"Alabama" 2010 7.25
"Alabama" 2011 7.25
"Alabama" 2012 7.25
"Alabama" 2013 7.25
"Alabama" 2014 7.25
"Alabama" 2015 7.25
"Alabama" 2016 7.25
"Alabama" 2017 7.25
"Alabama" 2018 7.25
"Louisiana" 2010 7.25
"Louisiana" 2011 7.25
"Louisiana" 2012 7.25
"Louisiana" 2013 7.25
"Louisiana" 2014 7.25
"Louisiana" 2015 7.25
"Louisiana" 2016 7.25
"Louisiana" 2017 7.25
"Louisiana" 2018 7.25
"Mississippi" 2010 7.25
"Mississippi" 2011 7.25
"Mississippi" 2012 7.25
"Mississippi" 2013 7.25
"Mississippi" 2014 7.25
"Mississippi" 2015 7.25
"Mississippi" 2016 7.25
"Mississippi" 2017 7.25
"Mississippi" 2018 7.25
"Oklahoma" 2010 7.25
"Oklahoma" 2011 7.25
"Oklahoma" 2012 7.25
"Oklahoma" 2013 7.25
"Oklahoma" 2014 7.25
"Oklahoma" 2015 7.25
"Oklahoma" 2016 7.25
"Oklahoma" 2017 7.25
"Oklahoma" 2018 7.25
"South Carolina" 2010 7.25
"South Carolina" 2011 7.25
"South Carolina" 2012 7.25
"South Carolina" 2013 7.25
"South Carolina" 2014 7.25
"South Carolina" 2015 7.25
"South Carolina" 2016 7.25
"South Carolina" 2017 7.25
"South Carolina" 2018 7.25
"Tennessee" 2010 7.25
"Tennessee" 2011 7.25
"Tennessee" 2012 7.25
"Tennessee" 2013 7.25
"Tennessee" 2014 7.25
"Tennessee" 2015 7.25
"Tennessee" 2016 7.25
"Tennessee" 2017 7.25
"Tennessee" 2018 7.25
"District of Columbia" 2010 8.25
"District of Columbia" 2011 8.25
"District of Columbia" 2012 8.25
"District of Columbia" 2013 8.25
"District of Columbia" 2014 9.50
"District of Columbia" 2015 10.50
"District of Columbia" 2016 11.50
"District of Columbia" 2017 12.50
"District of Columbia" 2018 13.25
end

* Append to cleaned minimum wage dataset
append using "Data/IntermediateData/cleaned_min_wage_fixed.dta"

* Save updated minimum wage dataset
save "Data/IntermediateData/cleaned_min_wage_fixed.dta", replace

* Merge Minimum Wage Data with Analysis Dataset

use "Data/IntermediateData/income_cps_merged.dta", clear

* Ensure `state` formatting is consistent
replace state = trim(ustrtrim(state))

* Merge (many-to-one) Minimum Wage Data onto analysis dataset
merge m:1 state year using "Data/IntermediateData/cleaned_min_wage_fixed.dta"

* Drop unmatched using observations
drop if _merge == 2
drop _merge

* Save the updated merged dataset
save "Data/IntermediateData/income_cps_minwage_merged.dta", replace

* ************************************************************
* Step 5: Merge Unemployment Data (with Fix)
* ************************************************************

* Load and Prepare Unemployment Data
use "Data/IntermediateData/unemployment_data.dta", clear

* Convert state names to proper format
replace state = proper(state)

* Fix incorrect multi-word state names
replace state = "New Hampshire" if state == "Newhampshire"
replace state = "New Jersey" if state == "Newjersey"
replace state = "New Mexico" if state == "Newmexico"
replace state = "New York" if state == "Newyork"
replace state = "North Carolina" if state == "Northcarolina"
replace state = "North Dakota" if state == "Northdakota"
replace state = "Rhode Island" if state == "Rhodeisland"
replace state = "South Carolina" if state == "Southcarolina"
replace state = "South Dakota" if state == "Southdakota"
replace state = "West Virginia" if state == "Westvirginia"

* Fix District of Columbia naming issue
replace state = "District of Columbia" if state == "District"

* Trim extra spaces
replace state = trim(ustrtrim(state))

* Ensure `year` is numeric
destring year, replace force

* Drop extra year (2019) if present
drop if year > 2018

* Collapse to state-year level (annual averages)
collapse (mean) unemployment_rate labor_force_participation_rate employment_population_ratio, by(state year)

* Save cleaned unemployment dataset
save "Data/IntermediateData/cleaned_unemployment.dta", replace

* Merge Unemployment Data with Main Dataset
use "Data/IntermediateData/income_cps_minwage_merged.dta", clear

* Drop incorrect columns before merging again
drop employment_population_ratio labor_force_participation_rate

* Ensure state formatting is consistent
replace state = trim(ustrtrim(state))

* Merge (many-to-one) unemployment data
merge m:1 state year using "Data/IntermediateData/cleaned_unemployment.dta"

* Check merge results
tab _merge

* Drop unmatched using observations
drop if _merge == 2
drop _merge

* Save final merged dataset
save "Data/AnalysisData/final_analysis_data.dta", replace
